1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(RColorBrewer)
  library(scatterplot3d)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_delim(file = here("doc/Samples.tsv"), 
                      delim="\t",
                      col_types=cols(
                        SampleID=col_factor(),
                        Time=col_factor(),
                        Replicate=col_factor())
                      ) %>% 
  mutate(Time=relevel(Time,"std"))

2 Raw data

filelist <- list.files(here("data/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sort the raw data and samples

samples <- samples[match(basename(dirname(filelist)),samples$SampleID),]
stopifnot(all(samples$SampleID == basename(dirname(filelist))))

name the file list vector

names(filelist) <- samples$SampleID

Read the expression at the transcript level (we have no gene information)

counts <- suppressMessages(round(tximport(files = filelist, 
                                  type = "salmon",txOut = TRUE)$counts))

Read the algae IDs

IDs <- scan(here("data/analysis/annotation/algae-IDs.txt"),
            what = "character")

Subset the data

counts <- counts[IDs,]

2.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "4.5% percent (2244) of 49477 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by Time
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat,aes(x,y,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),
        axis.title.x=element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 2244 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by Time

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Time=samples$Time[match(ind,samples$SampleID)])

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 415722 rows containing non-finite values (stat_density).

2.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Time)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
B2_2_120hrs_A B2_2_120hrs_B B2_2_120hrs_C B2_2_120hrs_D B2_2_12hrs_A
1.122 1.115 1.211 1.009 0.9364
Table continues below
B2_2_12hrs_B B2_2_12hrs_C B2_2_12hrs_D B2_2_24hrs_A B2_2_24hrs_B
0.9092 0.85 1.327 1.076 1.287
Table continues below
B2_2_24hrs_C B2_2_24hrs_D B2_2_4hrs_A B2_2_4hrs_B B2_2_4hrs_C
1.162 0.9715 0.7666 0.681 0.757
Table continues below
B2_2_4hrs_D B2_2_60minA B2_2_60minB B2_2_60minC B2_2_60minD
0.7047 1.03 1.327 1.128 1.239
Table continues below
B2_2_72hrs_A B2_2_72hrs_B B2_2_72hrs_C B2_2_72hrs_D B2_2_std_A
1.188 1.236 0.8674 0.9216 1.238
B2_2_std_B B2_2_std_C B2_2_std_D
1.094 1.088 1.325
boxplot(sizes, main="Sequencing libraries size factor")

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(vst)>0,])

3.3 QC on the normalised data

3.3.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=1

An the number of possible combinations

nlevel=nlevels(dds$Time)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

3.3.2 3 first dimensions

mar=c(5.1,4.1,4.1,2.1)

The PCA shows that a large fraction of the variance is explained by both variables.

scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(dds$Time)],pch=19)
legend("topleft",
       fill=pal[1:nlevels(dds$Time)],
       legend=levels(dds$Time))

par(mar=mar)

3.3.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples)

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Replicate,text=SampleID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

3.3.4 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst,
                           conditions=dds$Time,
                           nrep=3)

vst.cutoff <- 3
  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = dds$Time,
          col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="")

3.4 Conclusion

The data looks good. Clearly, there has been a sample swap between B2_2_24hrs_D and B2_2_12hrs_D.

D12p <- which(dds$SampleID == "B2_2_12hrs_D")
D24p <- which(dds$SampleID == "B2_2_24hrs_D")
colData(dds)[D12p,"Time"] <- "24hrs"
colData(dds)[D24p,"Time"] <- "12hrs"
samples[D12p,"Time"] <- "24hrs"
samples[D24p,"Time"] <- "12hrs"

save(dds,file=here("data/analysis/salmon/dds-sample-swap-corrected.rda"))
write_tsv(samples,path = here("doc/Samples-swap-corrected.tsv"))

4 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.54.0                  tximport_1.14.0            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.3                 purrr_0.3.3                
##  [7] readr_1.3.1                 tidyr_1.0.0                
##  [9] tibble_2.1.3                tidyverse_1.3.0            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.9.1                pander_0.6.3               
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             here_0.1                   
## [19] gplots_3.0.1.1              DESeq2_1.26.0              
## [21] SummarizedExperiment_1.16.0 DelayedArray_0.12.0        
## [23] BiocParallel_1.20.0         matrixStats_0.55.0         
## [25] Biobase_2.46.0              GenomicRanges_1.38.0       
## [27] GenomeInfoDb_1.22.0         IRanges_2.20.1             
## [29] S4Vectors_0.24.1            BiocGenerics_0.32.0        
## [31] data.table_1.12.8          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       rprojroot_1.3-2        htmlTable_1.13.3      
##   [4] XVector_0.26.0         base64enc_0.1-3        fs_1.3.1              
##   [7] rstudioapi_0.10        hexbin_1.28.0          farver_2.0.1          
##  [10] affyio_1.56.0          bit64_0.9-7            fansi_0.4.0           
##  [13] AnnotationDbi_1.48.0   lubridate_1.7.4        xml2_1.2.2            
##  [16] splines_3.6.1          geneplotter_1.64.0     knitr_1.26            
##  [19] zeallot_0.1.0          Formula_1.2-3          jsonlite_1.6          
##  [22] broom_0.5.3            annotate_1.64.0        cluster_2.1.0         
##  [25] dbplyr_1.4.2           shiny_1.4.0            BiocManager_1.30.10   
##  [28] compiler_3.6.1         httr_1.4.1             backports_1.1.5       
##  [31] fastmap_1.0.1          assertthat_0.2.1       Matrix_1.2-18         
##  [34] lazyeval_0.2.2         limma_3.42.0           cli_2.0.0             
##  [37] later_1.0.0            acepack_1.4.1          htmltools_0.4.0       
##  [40] tools_3.6.1            affy_1.64.0            gtable_0.3.0          
##  [43] glue_1.3.1             GenomeInfoDbData_1.2.2 Rcpp_1.0.3            
##  [46] cellranger_1.1.0       vctrs_0.2.1            preprocessCore_1.48.0 
##  [49] gdata_2.18.0           nlme_3.1-143           crosstalk_1.0.0       
##  [52] xfun_0.11              rvest_0.3.5            testthat_2.3.1        
##  [55] mime_0.7               lifecycle_0.1.0        gtools_3.8.1          
##  [58] XML_3.98-1.20          zlibbioc_1.32.0        scales_1.1.0          
##  [61] promises_1.1.0         hms_0.5.2              yaml_2.2.0            
##  [64] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [67] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.1.4         
##  [70] highr_0.8              genefilter_1.68.0      checkmate_1.9.4       
##  [73] caTools_1.17.1.3       rlang_0.4.2            pkgconfig_2.0.3       
##  [76] bitops_1.0-6           evaluate_0.14          labeling_0.3          
##  [79] htmlwidgets_1.5.1      bit_1.1-14             tidyselect_0.2.5      
##  [82] magrittr_1.5           R6_2.4.1               generics_0.0.2        
##  [85] Hmisc_4.3-0            DBI_1.1.0              pillar_1.4.2          
##  [88] haven_2.2.0            foreign_0.8-72         withr_2.1.2           
##  [91] survival_3.1-8         RCurl_1.95-4.12        nnet_7.3-12           
##  [94] modelr_0.1.5           crayon_1.3.4           KernSmooth_2.23-16    
##  [97] rmarkdown_2.0          locfit_1.5-9.1         readxl_1.3.1          
## [100] blob_1.2.0             reprex_0.3.0           digest_0.6.23         
## [103] xtable_1.8-4           httpuv_1.5.2           munsell_0.5.0         
## [106] viridisLite_0.3.0